Note: Code chunks are collated and echoed at the end of the document in Appendix I.
Generate a digital gene expression list that could be easily shared/loaded for downstream filtering/normalization.
targets <- read_tsv(paste0("./Data/cele_embryonic/embryonic_study_design.txt"),
na = c("", "NA", "na"), show_col_types = F)
# load pre-generated annotation information
load(paste0("./Outputs/elegans_geneAnnotations"))
# import featureCount output into R ----
path <- paste0("./Data/cele_embryonic/featureCount.C_elegans.", targets$Biological_ID, ".CDS.unique_only.ws290.txt")
featureCountData<- rbindlist(lapply(path, fread), idcol="sample") %>%
mutate(sample = targets$sample[sample])
colnames(featureCountData)<-c('sample','geneID', 'Ce_ortholog', 'stableID', "location", "length", "count")
featureCountData_wider <- featureCountData %>%
dplyr::select(!c(Ce_ortholog, location, length)) %>%
pivot_wider(names_from = sample, values_from = count)
counts <- featureCountData_wider %>%
dplyr::select(-stableID)%>%
column_to_rownames(var = "geneID")
annotations_sub<-dplyr::select(featureCountData_wider, geneID) %>%
left_join(annotations, by = "geneID")
# generate a DGEList
myDGEList <- DGEList(counts,
samples = targets$sample,
group = targets$group,
genes = annotations_sub)
output.name <- 'CeleEmbryonic_DGEList'
save(myDGEList,
file = file.path(output.path,
output.name))
The goal of this chunk is to:
ggplot2 to visualize the impact of filtering and
normalization on the data (see Output section, below)._vDGEList). iii) a matrix of variance-stabilized gene
expression data, extracted from the vDGEList
(_log2cpm_filtered_norm_voom.csv) - this data is
downloadable from within the Browser App.
# calculate log2 counts per million and filter the data----
# use the 'cpm' function from EdgeR to get log2 counts per million
# filter genes/transcripts with low counts
# how many genes had more than 1 CPM (TRUE) in at least n samples
# Note: The cutoff "n" is adjusted for the number of
# samples in the smallest group of comparison.
keepers <- cpm(myDGEList) %>%
rowSums(.>1)>=1
myDGEList.filtered <- myDGEList[keepers,]
# Normalize the data using a between samples normalization ----
# Source for TMM sample normalization here:
# https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)
log2.cpm.filtered.norm.df<- cpm(myDGEList.filtered.norm, log=TRUE) %>%
as_tibble(rownames = "geneID") %>%
setNames(nm = c("geneID", targets$sample))
# Compute Variance-Stabilized DGEList Object ----
# Set up the design matrix ----
group <- factor(targets$stage)
block <- factor(targets$moreb) #do blocking by polya+ and wholeRNA (by sample date)
# NOTE: For no intercept/blocking for matrix, comparisons across group:
#design <- model.matrix(~0 + group)
#colnames(design) <- levels(group)
# NOTE: To handle a 'blocking' design' or a batch effect, use:
design <- model.matrix(~block + group)
# Model mean-variance trend and fit linear model to data ----
# Use VOOM function from Limma package to model the mean-variance relationship
# produces a variance-stabilized DGEList, that include precision
# weights for each gene to try and control for heteroscedasity.
# transforms count data to log2-counts per million
# Outputs: E = normalized expression values on the log2 scale
v.DGEList.filtered.norm <- voom(counts = myDGEList.filtered.norm,
design = design, plot = T)
colnames(v.DGEList.filtered.norm)<-targets$sample
colnames(v.DGEList.filtered.norm$E) <- paste(targets$block,targets$group,
targets$sample,sep = '-')
# Save matrix of genes and their filtered, normalized, voom-transformed counts ----
# This is the count data that underlies the differential expression analyses in the Shiny app.
# Saving it here so that users of the app can access the input information.
output.name <- 'CeleEmbryonic_log2cpm_filtered_norm_voom.csv'
# Save in Shiny app download (www) folder
write.csv(v.DGEList.filtered.norm$E,
file = file.path(www.path,
output.name))
# Save v.DGEList ----
# Save in Shiny app Data folder
output.name <- 'CeleEmbryonic__vDGEList'
save(v.DGEList.filtered.norm,
file = file.path(app.path,
output.name))
# use the 'cpm' function from EdgeR to get log2 counts per million
# then coerce into a tibble
# Generate life stage IDs
ids <- rep(cbind(targets$group),
times = nrow(myDGEList$counts)) %>%
as_factor()
log2.cpm.df.pivot <- cpm(myDGEList, log=TRUE) %>%
as_tibble(rownames = "geneID") %>%
setNames(nm = c("geneID", targets$sample)) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
add_column(life_stage = ids)
p1 <- ggplot(log2.cpm.df.pivot) +
aes(x=samples, y=expression, fill=life_stage) +
geom_violin(trim = FALSE, show.legend = T, alpha= 0.7) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title = paste0("C. elegans Embryonic: Log2 Counts per Million (CPM)"),
subtitle="unfiltered, non-normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
coord_flip()
p1
ids.filtered <- rep(cbind(targets$group),
times = nrow(myDGEList.filtered)) %>%
as_factor()
log2.cpm.filtered.df.pivot <- cpm(myDGEList.filtered, log=TRUE) %>%
as_tibble(rownames = "geneID") %>%
setNames(nm = c("geneID", targets$sample)) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
add_column(life_stage = ids.filtered)
p2 <- ggplot(log2.cpm.filtered.df.pivot) +
aes(x=samples, y=expression, fill=life_stage) +
geom_violin(trim = FALSE, show.legend = T, alpha= 0.7) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title = paste0("C. elegans Embryonic: Log2 Counts per Million (CPM)"),
subtitle="filtered, non-normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
coord_flip()
p2
log2.cpm.filtered.norm.df.pivot<-log2.cpm.filtered.norm.df %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
add_column(life_stage = ids.filtered)
p3 <- ggplot(log2.cpm.filtered.norm.df.pivot) +
aes(x=samples, y=expression, fill=life_stage) +
geom_violin(trim = FALSE, show.legend = T, alpha = 0.7) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title = paste0("C. elegans Embryonic: Log2 Counts per Million (CPM)"),
subtitle="filtered, TMM normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
coord_flip()
p3
# Look at the genes excluded by the filtering step ----
# just to check that there aren't any with
# high expression that are in few samples
# Discarded genes
myDGEList.discarded <- myDGEList[!keepers,]
ids.discarded <- rep(cbind(targets$group),
times = nrow(myDGEList.discarded)) %>%
as_factor()
log2.cpm.discarded.df.pivot <- cpm(myDGEList.discarded, log=F) %>%
as_tibble(rownames = "geneID") %>%
setNames(nm = c("geneID", targets$sample)) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
add_column(life_stage = ids.discarded)
# Genes that are above 1 cpm
log2.cpm.discarded.df.pivot %>%
dplyr::filter(expression > 1)
# Generate a matrix of discarded genes and their raw counts ----
discarded.gene.df <- log2.cpm.discarded.df.pivot %>%
pivot_wider(names_from = c(life_stage, samples),
names_sep = "-",
values_from = expression,
id_cols = geneID)%>%
left_join(annotations, by = "geneID")
# Save a matrix of discarded genes and their raw counts ----
output.name <- 'CeleEmbryonic_discardedGenes.csv'
discarded.gene.df %>%
write.csv(file = file.path(output.path,
output.name))
# Plot discarded genes
p.discarded <- ggplot(log2.cpm.discarded.df.pivot) +
aes(x=samples, y=expression, color=life_stage) +
geom_jitter(alpha = 0.3, show.legend = T)+
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
color = "black",
show.legend = FALSE) +
labs(y="expression", x = "sample",
title = paste0("C. elegans Embryonic: Counts per Million (CPM)"),
subtitle="genes excluded by low count filtering step, non-normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
coord_flip()
p.discarded
This code chunk starts with filtered and normalized abundance data in a data frame (not tidy). It will implement hierarchical clustering and PCA analyses on the data. It will plot various graphs, including a dendrogram of the heirachical clustering, and several plots of visualize the PCA. The goal of this section is to see if these two data sets can be combined for a joint analysis.
# Identify variables of interest in study design file ----
group <- factor(targets$stage)
# Hierarchical clustering ---------------
# Remember: hierarchical clustering can only work on a data matrix, not a data frame
# Calculate distance matrix
# dist calculates distance between rows, so transpose data so that we get distance between samples.
# how similar are samples from each other
colnames(log2.cpm.filtered.norm)<- targets$group
distance <- dist(t(log2.cpm.filtered.norm), method = "maximum") #other distance methods are "euclidean", maximum", "manhattan", "canberra", "binary" or "minkowski"
# Calculate clusters to visualize differences. This is the hierarchical clustering.
# The methods here include: single (i.e. "friends-of-friends"), complete (i.e. complete linkage), and average (i.e. UPGMA). Here's a comparison of different types: https://en.wikipedia.org/wiki/UPGMA#Comparison_with_other_linkages
clusters <- hclust(distance, method = "complete") #other agglomeration methods are "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", or "centroid"
dend <- as.dendrogram(clusters)
p4<-dend %>%
dendextend::set("branches_k_color", k = 5) %>%
dendextend::set("hang_leaves", c(0.1)) %>%
dendextend::set("labels_cex", c(0.5)) %>%
dendextend::set("labels_colors", k = 5) %>%
dendextend::set("branches_lwd", c(0.7)) %>%
as.ggdend %>%
ggplot (offset_labels = -0.5) +
theme_dendro() +
ylim(-8, max(get_branches_heights(dend))) +
labs(title = "Hierarchical Cluster Dendrogram ",
subtitle = "filtered, TMM normalized",
y = "Distance",
x = "Life stage") +
coord_fixed(1/2) +
theme(axis.title.x = element_text(color = "black"),
axis.title.y = element_text(angle = 90),
axis.text.y = element_text(angle = 0),
axis.line.y = element_line(color = "black"),
axis.ticks.y = element_line(color = "black"),
axis.ticks.length.y = unit(2, "mm"))
Clustering performed on filtered and normalized abundance data using
the “complete” method.
# Principal component analysis (PCA) -------------
# this also works on a data matrix, not a data frame
pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T)
summary(pca.res) # Prints variance summary for all principal components.
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 151.1711 71.3233 52.77388 40.28261 33.84034 27.82526
## Proportion of Variance 0.5582 0.1242 0.06803 0.03963 0.02797 0.01891
## Cumulative Proportion 0.5582 0.6824 0.75045 0.79008 0.81805 0.83696
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 24.18156 20.1370 17.76894 17.36531 16.41680 16.27187
## Proportion of Variance 0.01428 0.0099 0.00771 0.00737 0.00658 0.00647
## Cumulative Proportion 0.85125 0.8611 0.86886 0.87623 0.88281 0.88928
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 15.54952 14.36360 14.1614 13.75806 13.2620 13.12308
## Proportion of Variance 0.00591 0.00504 0.0049 0.00462 0.0043 0.00421
## Cumulative Proportion 0.89518 0.90022 0.9051 0.90974 0.9140 0.91825
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 12.4693 12.31984 11.98556 11.76961 11.4539 10.8951
## Proportion of Variance 0.0038 0.00371 0.00351 0.00338 0.0032 0.0029
## Cumulative Proportion 0.9220 0.92575 0.92926 0.93264 0.9358 0.9387
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 10.77788 10.47822 10.40139 10.22759 9.98258 9.94537
## Proportion of Variance 0.00284 0.00268 0.00264 0.00255 0.00243 0.00242
## Cumulative Proportion 0.94158 0.94427 0.94691 0.94946 0.95190 0.95431
## PC31 PC32 PC33 PC34 PC35 PC36 PC37
## Standard deviation 9.65700 9.54413 9.39232 9.37475 9.24814 9.17675 9.08021
## Proportion of Variance 0.00228 0.00222 0.00215 0.00215 0.00209 0.00206 0.00201
## Cumulative Proportion 0.95659 0.95882 0.96097 0.96312 0.96521 0.96726 0.96928
## PC38 PC39 PC40 PC41 PC42 PC43 PC44
## Standard deviation 8.90145 8.83455 8.47314 8.42971 8.22338 8.0961 8.00872
## Proportion of Variance 0.00194 0.00191 0.00175 0.00174 0.00165 0.0016 0.00157
## Cumulative Proportion 0.97121 0.97312 0.97487 0.97661 0.97826 0.9799 0.98143
## PC45 PC46 PC47 PC48 PC49 PC50 PC51
## Standard deviation 7.80177 7.74659 7.68201 7.50380 7.42730 7.22874 7.10471
## Proportion of Variance 0.00149 0.00147 0.00144 0.00138 0.00135 0.00128 0.00123
## Cumulative Proportion 0.98291 0.98438 0.98582 0.98720 0.98854 0.98982 0.99105
## PC52 PC53 PC54 PC55 PC56 PC57 PC58
## Standard deviation 7.0069 6.7095 6.3853 6.0636 5.94847 5.63466 5.30005
## Proportion of Variance 0.0012 0.0011 0.0010 0.0009 0.00086 0.00078 0.00069
## Cumulative Proportion 0.9922 0.9933 0.9943 0.9952 0.99611 0.99689 0.99757
## PC59 PC60 PC61 PC62 PC63
## Standard deviation 5.25704 5.11576 4.89976 4.64896 1.338e-13
## Proportion of Variance 0.00068 0.00064 0.00059 0.00053 0.000e+00
## Cumulative Proportion 0.99825 0.99889 0.99947 1.00000 1.000e+00
#pca.res$rotation #$rotation shows you how much each gene influenced each PC (called 'scores')
#pca.res$x # 'x' shows you how much each sample influenced each PC (called 'loadings')
#note that these have a magnitude and a direction (this is the basis for making a PCA plot)
## This generates a screeplot: a standard way to view eigenvalues for each PCA. Shows the proportion of variance accounted for by each PC. Plotting only the first 10 dimensions.
p5<-fviz_eig(pca.res,
#barcolor = brewer.pal(8,"Pastel2")[8],
#barfill = brewer.pal(8,"Pastel2")[8],
linecolor = "black",
main = "Scree plot: proportion of variance accounted for by each principal component", ggtheme = theme_bw())
A scree plot is a standard way to view eigenvalues for each PCA. The plot shows the proportion of variance accounted for by each PC.
p5
pc.var<-pca.res$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per<-round(pc.var/sum(pc.var)*100, 1) # we can then use these eigenvalues to calculate the percentage variance explained by each PC
# Visualize the PCA result ------------------
#lets first plot any two PCs against each other
#We know how much each sample contributes to each PC (loadings), so let's plot
pca.res.df <- as_tibble(pca.res$x)
# Plotting PC1 and PC2
p6<-ggplot(pca.res.df) +
aes(x=PC1, y=PC2, label=targets$stage,
fill = targets$stage,
color = targets$stage,
shape = targets$block
) +
geom_point(size=4) +
xlab(paste0("PC1 (",pc.per[1],"%",")")) +
ylab(paste0("PC2 (",pc.per[2],"%",")")) +
labs(title="Principal Components Analysis",
sub = "Note: analysis is blind to sample identity.",
shape = "Experiment",
fill = "Time point",
color = "Time point") +
scale_x_continuous(expand = c(.3, .3)) +
scale_y_continuous(expand = c(.3, .3)) +
coord_fixed() +
theme_bw() +
theme(text = element_text(size = 10),
title = element_text(size = 10))
Plot of the samples in PCA space. Fill color indicates life
stage.
# Create a PCA 'small multiples' chart ----
pca.res.df <- pca.res$x[,1:6] %>%
as_tibble() %>%
add_column(sample = targets$sample,
group = targets$stage)
pca.pivot <- pivot_longer(pca.res.df, # dataframe to be pivoted
cols = PC1:PC3, # column names to be stored as a SINGLE variable
names_to = "PC", # name of that new variable (column)
values_to = "loadings") # name of new variable (column) storing all the values (data)
PC1<-subset(pca.pivot, PC == "PC1")
PC2 <-subset(pca.pivot, PC == "PC2")
PC3 <- subset(pca.pivot, PC == "PC3")
#PC4 <- subset(pca.pivot, PC == "PC4")
# New facet label names for PCs
PC.labs <- c(paste0("PC1 (",pc.per[1],"%",")"),
paste0("PC2 (",pc.per[2],"%",")"),
paste0("PC3 (",pc.per[3],"%",")")
)
names(PC.labs) <- c("PC1", "PC2", "PC3")
p6<-ggplot(pca.pivot) +
aes(x=sample, y=loadings) + # you could iteratively 'paint' different covariates onto this plot using the 'fill' aes
geom_bar(stat="identity") +
facet_wrap(~PC, labeller = labeller(PC = PC.labs)) +
geom_bar(data = PC1, stat = "identity", aes(fill = group)) +
geom_bar(data = PC2, stat = "identity", aes(fill = group)) +
geom_bar(data = PC3, stat = "identity", aes(fill = group)) +
labs(title="PCA 'small multiples' plot",
fill = "Life Stage",
caption=paste0("produced on ", Sys.time())) +
scale_x_discrete(limits = targets$sample, labels = targets$group) +
theme_bw() +
theme(text = element_text(size = 10),
title = element_text(size = 10)) +
coord_flip()